#!/usr/bin/env python
# -*- coding:utf-8 -*-

from lib.utils.check import *
from lib.utils.printer import *
from urllib.parse import unquote_plus
from re import search,findall,I
from lib.request.request import *
from urllib.parse import urlsplit,urlunparse
from bs4 import BeautifulSoup

# Excluded extensions: suffixes of media/archive/binary resources.
# The tuple is kept in alphabetical order, one leading letter per row.
EXCLUDED_MEDIA_EXTENSIONS = (
    '.7z',
    '.aac', '.aiff', '.au', '.avi',
    '.bin', '.bmp',
    '.cab',
    '.dll', '.dmp',
    '.ear', '.exe',
    '.flv',
    '.gif', '.gz',
    '.image', '.iso',
    '.jar', '.jpeg', '.jpg',
    '.mkv', '.mov', '.mp3', '.mp4', '.mpeg', '.mpg',
    '.pdf', '.png', '.ps',
    '.rar',
    '.scm', '.so',
    '.tar', '.tif',
    '.war', '.wav', '.wmv',
    '.zip',
)

class SCrawler(Request):
    """Fetch a single page and collect the links and forms it contains.

    The page is downloaded with ``self.Send`` (not defined in this class;
    presumably provided by the ``Request`` base class -- confirm) and parsed
    with BeautifulSoup.

    Attributes:
        url: URL of the page being crawled.
        data: Request body passed to ``Send`` together with ``url``.
        forms: ``<form>`` tags collected by :meth:`form`.
        ok_links: Initialised empty; no method of this class fills it.
        all_links: Raw ``href``/``src`` values collected by :attr:`extract`.
        scheme: Scheme component of ``url``.
        netloc: Network-location component of ``url``.
        content: Response body stored by :meth:`run` (``None`` before that).
    """

    def __init__(self, kwargs, url, data):
        Request.__init__(self, kwargs)
        self.url = url
        self.data = data
        self.forms = []
        self.ok_links = []
        self.all_links = []
        # Split the URL once and keep the two parts used for link resolution.
        parts = urlsplit(url)
        self.scheme = parts.scheme
        self.netloc = parts.netloc
        self.content = None

    def run(self):
        """Request ``self.url`` and store the response body in ``self.content``."""
        resp = self.Send(url=self.url, data=self.data)
        self.content = resp.content

    @property
    def extract(self):
        """Append every anchor ``href`` and frame/iframe ``src`` to ``all_links``.

        Fragments (``#...``) are stripped from each value. Accessing this
        property has side effects only; it evaluates to ``None``.
        """
        # Parse once: every access to ``self.soup`` re-parses the document.
        soup = self.soup

        for tag in soup.find_all('a', href=True):
            self.all_links.append(tag['href'].split("#")[0])

        for tag in soup.find_all(['frame', 'iframe'], src=True):
            self.all_links.append(tag['src'].split("#")[0])

    @property
    def soup(self):
        """Return a freshly parsed BeautifulSoup tree of ``self.content``.

        The parser is named explicitly so the result does not depend on which
        optional parsers happen to be installed. Content that is still
        ``None`` (``run`` not called yet) is parsed as an empty document.
        """
        return BeautifulSoup(self.content or "", "html.parser")

    def form(self):
        """Collect the page's ``<form>`` tags into ``self.forms`` without duplicates."""
        for form in self.soup.find_all('form'):
            if form not in self.forms:
                self.forms.append(form)

    @staticmethod
    def _attr_values(markup, attr):
        """Return the quoted values of attribute *attr* found in *markup*.

        The look-behind keeps ``name`` from matching inside ``data-name`` or
        ``formmethod``-style attributes; the back-reference requires the
        closing quote to be the same character as the opening one.
        """
        pattern = r'(?<![\w-])%s\s*=\s*([\'"])(.*?)\1' % attr
        return [value for _quote, value in findall(pattern, markup, I)]

    def extract_form(self, form, url):
        """Build the request described by an HTML form.

        Args:
            form: Markup of the form; any object is converted with ``str()``,
                so a BeautifulSoup tag is accepted as well as a string.
            url: URL of the page the form was found on.

        Returns:
            * GET form: ``"action?name=value&..."`` (or just the action when
              the form has no inputs).
            * POST form: ``(action, "name=value&...")`` (or just the action
              when the form has no inputs).
            * ``None`` when no action could be determined or the method is
              neither GET nor POST.
        """
        markup = str(form)

        # Read method/action from the opening <form ...> tag only, so that
        # attributes of nested elements cannot be mistaken for them.
        opening = search(r'<form\b[^>]*>', markup, I)
        head = opening.group(0) if opening else markup

        method = self.check_method(self._attr_values(head, 'method'))
        try:
            action = self.check_action(self._attr_values(head, 'action'), url)
        except Exception:
            # Best effort, as in the original code: check_action relies on a
            # helper that is not visible here; a failure means "no action".
            action = None

        pairs = []
        for tag in findall(r'<input\b[^>]*>', markup, I):
            name = self.check_name_value(self._attr_values(tag, 'name'))
            value = self.check_name_value(self._attr_values(tag, 'value'))
            pairs.append("%s=%s" % (name, value))
        query = "&".join(pairs)

        if not action:
            return None

        verb = method.lower()
        if verb == "get":
            if query:
                return "%s?%s" % (action, query)
            return action
        if verb == "post":
            if query:
                return action, query
            return action
        return None

    @staticmethod
    def _hostname(netloc):
        """Return the lower-cased host of *netloc* (no userinfo/port), or ''."""
        try:
            return urlsplit("//" + netloc).hostname or ""
        except ValueError:
            return ""

    def _same_site(self, netloc):
        """Return True when *netloc* is the crawled host or a parent/sub-domain of it."""
        mine = self._hostname(self.netloc)
        other = self._hostname(netloc)
        if not mine or not other:
            return False
        return (
            other == mine
            or other.endswith("." + mine)
            or mine.endswith("." + other)
        )

    def absolute(self, link):
        """Resolve *link* against the page URL.

        Returns:
            The absolute http(s) URL without its fragment when the link points
            to the crawled site; ``self.url`` for an empty or fragment-only
            link; ``None`` when the link is excluded by :meth:`check_ext`,
            malformed, uses another scheme (``mailto:``, ``javascript:``, ...)
            or points to a different site.
        """
        # Imported here because urljoin is not among the module-level imports.
        from urllib.parse import urljoin

        link = self.check_ext(link)
        if link is None:
            return None

        link = link.strip()
        if link == "" or link.startswith("#"):
            return self.url

        try:
            # urljoin covers absolute, scheme-relative ("//host/..."),
            # root-relative, query-only and path-relative links alike.
            parts = urlsplit(urljoin(self.url, link))
        except ValueError:
            return None

        if parts.scheme not in ("http", "https"):
            return None
        if not self._same_site(parts.netloc):
            return None
        return urlunparse(
            (parts.scheme, parts.netloc, parts.path or '/', '', parts.query, '')
        )

    def check_ext(self, link):
        """Return *link*, or ``None`` if its path ends with an excluded extension.

        Empty/``None`` input is returned unchanged.
        """
        if not link:
            return link
        try:
            path = urlsplit(link).path
        except ValueError:
            # Not parseable as a URL: fall back to testing the raw string.
            path = link
        if path.lower().endswith(EXCLUDED_MEDIA_EXTENSIONS):
            return None
        return link

    def check_method(self, method):
        """Return the first entry of *method* (a list of matches), or "GET" if empty."""
        if method:
            return method[0]
        return "GET"

    def check_url(self, url):
        """Normalise *url*: URL-decode it, unescape ``&amp;``, drop ``#``, and
        encode spaces as ``+``."""
        url = unquote_plus(url)
        url = url.replace("&amp;", "&")
        url = url.replace("#", "")
        url = url.replace(" ", "+")
        return url

    def check_action(self, action, url):
        """Return the normalised target URL for a form.

        Args:
            action: List of ``action`` attribute matches (may be empty).
            url: URL of the page containing the form.

        The page URL itself is used when there is no action, when the action
        is ``"/"``, or when the action text already occurs in *url*. Otherwise
        ``url + action[0]`` is passed through ``Cpath`` (not defined in this
        file; presumably a path-cleaning helper from the project's star
        imports -- confirm).
        """
        if not action or action[0] == "/":
            return self.check_url(url)
        if action[0] in url:
            return self.check_url(url)
        return self.check_url(Cpath(url + action[0]))

    def check_name_value(self, string):
        """Return the first entry of *string* (a list of matches), or "TEST" if empty."""
        if string:
            return string[0]
        return "TEST"